Run download_data.Rmd and percentage_of_regional_richness.Rmd First!
city_data
fetch_city_data_for <- function(pool_name, include_city_name = F, include_pool_size = T) {
results_filename <- paste(paste(pool_name, 'city', 'richness', 'intercept', sep = "_"), "csv", sep = ".")
results <- read_csv(results_filename)
joined <- left_join(city_data, results)
pool_size_col_name <- paste(pool_name, 'pool', 'size', sep = "_")
required_columns <- c("population_growth", "rainfall_monthly_min", "rainfall_annual_average", "rainfall_monthly_max", "temperature_annual_average", "temperature_monthly_min", "temperature_monthly_max", "happiness_negative_effect", "happiness_positive_effect", "happiness_future_life", "number_of_biomes", "realm", "biome_name", "region_20km_includes_estuary", "region_50km_includes_estuary", "region_100km_includes_estuary", "city_includes_estuary", "region_20km_average_pop_density", "region_50km_average_pop_density", "region_100km_average_pop_density", "city_max_pop_density", "city_average_pop_density", "mean_population_exposure_topm2_5_2019", "region_20km_cultivated", "region_20km_urban", "region_50km_cultivated", "region_50km_urban", "region_100km_cultivated", "region_100km_urban", "region_20km_elevation_delta", "region_20km_mean_elevation", "region_50km_elevation_delta", "region_50km_mean_elevation", "region_100km_elevation_delta", "region_100km_mean_elevation", "city_elevation_delta", "city_mean_elevation", "urbanPshrubs", "permanent_water", "open_forest", "herbaceous_wetland", "herbaceous_vegetation", "cultivated", "closed_forest", "share_of_population_within_400m_of_open_space", "percentage_urban_area_as_streets", "percentage_urban_area_as_open_public_spaces_and_streets", "percentage_urban_area_as_open_public_spaces", "city_gdp_per_population", "city_ndvi", "city_ssm", "city_susm", "region_20km_ndvi", "region_20km_ssm", "region_20km_susm", "region_50km_ndvi", "region_50km_ssm", "region_50km_susm", "region_100km_ndvi", "region_100km_ssm", "region_100km_susm", "city_percentage_protected", "region_20km_percentage_protected", "region_50km_percentage_protected", "region_100km_percentage_protected")
if (include_pool_size) {
required_columns <- append(c(pool_size_col_name), required_columns)
}
if (include_city_name) {
required_columns <- append(c("name"), required_columns)
}
required_columns <- append(c("response"), required_columns)
joined[,required_columns]
}
merlin_city_data <- fetch_city_data_for('merlin')
merlin_city_data
library(car)
Loading required package: carData
Attaching package: ‘car’
The following object is masked from ‘package:dplyr’:
recode
The following object is masked from ‘package:purrr’:
some
The following object is masked from ‘package:boot’:
logit
merlin_city_data_fixed <- rfImpute(response ~ ., merlin_city_data)
merlin_city_data_fixed
source('./random_forest_selection_functions.R')
scale_parameter_name <- function(scale, postscript) {
paste('region', paste(scale, 'km', sep = ''), postscript, sep = '_')
}
scale_parameters <- function(postscript) {
c(scale_parameter_name(20, postscript), scale_parameter_name(50, postscript), scale_parameter_name(100, postscript))
}
scales_parameters_without <- function(scale_to_exclude, postscript) {
scales <- scale_parameters(postscript)
scales[scales != scale_parameter_name(scale_to_exclude, postscript)]
}
select_scales <- function(urban, cultivated, elevation_delta, mean_elevation, average_pop_density, includes_estuary, ssm, susm, ndvi, percentage_protected) {
append(
append(
append(
append(
scales_parameters_without(scale_to_exclude = urban, postscript = 'urban'),
scales_parameters_without(scale_to_exclude = cultivated, postscript = 'cultivated')
),
append(
scales_parameters_without(scale_to_exclude = elevation_delta, postscript = 'elevation_delta'),
scales_parameters_without(scale_to_exclude = mean_elevation, postscript = 'mean_elevation')
)
),
append(
append(
scales_parameters_without(scale_to_exclude = average_pop_density, postscript = 'average_pop_density'),
scales_parameters_without(scale_to_exclude = includes_estuary, postscript = 'includes_estuary')
),
append(
scales_parameters_without(scale_to_exclude = ssm, postscript = 'ssm'),
scales_parameters_without(scale_to_exclude = susm, postscript = 'susm')
)
)
),
append(
scales_parameters_without(scale_to_exclude = ndvi, postscript = 'ndvi'),
scales_parameters_without(scale_to_exclude = percentage_protected, postscript = 'percentage_protected')
)
)
}
select_scales(urban = 20, cultivated = 100, elevation_delta = 20, mean_elevation = 100, average_pop_density = NA, includes_estuary = NA, ssm = 20, susm = 20, ndvi = 100, percentage_protected = NA)
select_scales(urban = , cultivated = , elevation_delta = , mean_elevation = , average_pop_density = , includes_estuary = , ssm = , susm = , ndvi =, percentage_protected = )
select_variables_from_random_forest(merlin_city_data_fixed)
exclude_merlin <- !names(merlin_city_data_fixed) %in% select_scales(urban = 20, cultivated = 100, elevation_delta = 50, mean_elevation = 20, average_pop_density = 50, includes_estuary = NA, ssm = 100, susm = 50, ndvi = 20, percentage_protected = 50)
merlin_city_data_fixed_single_scale <- merlin_city_data_fixed[,exclude_merlin]
merlin_city_data_fixed_single_scale
select_variables_from_random_forest(merlin_city_data_fixed_single_scale)
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected", "region_100km_cultivated")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected", "region_100km_cultivated", "share_of_population_within_400m_of_open_space")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected", "region_100km_cultivated", "share_of_population_within_400m_of_open_space", "rainfall_monthly_max")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected", "region_100km_cultivated", "share_of_population_within_400m_of_open_space", "rainfall_monthly_max", "city_ndvi")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected", "region_100km_cultivated", "share_of_population_within_400m_of_open_space", "rainfall_monthly_max", "city_ndvi", "temperature_monthly_max")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected", "region_100km_cultivated", "share_of_population_within_400m_of_open_space", "rainfall_monthly_max", "city_ndvi", "temperature_monthly_max", "city_average_pop_density")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected", "region_100km_cultivated", "share_of_population_within_400m_of_open_space", "rainfall_monthly_max", "city_ndvi", "temperature_monthly_max", "city_average_pop_density", "region_50km_average_pop_density")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected", "region_100km_cultivated", "share_of_population_within_400m_of_open_space", "rainfall_monthly_max", "city_ndvi", "temperature_monthly_max", "city_average_pop_density", "region_50km_average_pop_density", "rainfall_annual_average")])
“merlin_pool_size”, “biome_name”, “realm”
birdlife_city_data <- fetch_city_data_for('birdlife')
birdlife_city_data
birdlife_city_data_fixed <- rfImpute(response ~ ., birdlife_city_data)
birdlife_city_data_fixed
select_variables_from_random_forest(birdlife_city_data_fixed)
exclude_birdlife <- !names(birdlife_city_data_fixed) %in% select_scales(urban = 100, cultivated = 100, elevation_delta = 20, mean_elevation = 100, average_pop_density = 20, includes_estuary = NA, ssm = 50, susm = 100, ndvi = 100, percentage_protected = 100)
birdlife_city_data_fixed_single_scale <- birdlife_city_data_fixed[,exclude_birdlife]
birdlife_city_data_fixed_single_scale
select_variables_from_random_forest(birdlife_city_data_fixed_single_scale)
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space", "shrubs")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space", "shrubs", "mean_population_exposure_to_pm2_5_2019")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space", "shrubs", "mean_population_exposure_to_pm2_5_2019", "city_average_pop_density")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space", "shrubs", "mean_population_exposure_to_pm2_5_2019", "city_average_pop_density", "percentage_urban_area_as_streets")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space", "shrubs", "mean_population_exposure_to_pm2_5_2019", "city_average_pop_density", "percentage_urban_area_as_streets", "rainfall_annual_average")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space", "shrubs", "mean_population_exposure_to_pm2_5_2019", "city_average_pop_density", "percentage_urban_area_as_streets", "rainfall_annual_average", "city_susm")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space", "shrubs", "mean_population_exposure_to_pm2_5_2019", "city_average_pop_density", "percentage_urban_area_as_streets", "rainfall_annual_average", "city_susm", "region_100km_ndvi")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space", "shrubs", "mean_population_exposure_to_pm2_5_2019", "city_average_pop_density", "percentage_urban_area_as_streets", "rainfall_annual_average", "city_susm", "region_100km_ndvi", "happiness_future_life")])
“population_growth”, “region_50km_ssm”, “birdlife_pool_size”
| So…. |
|---|
| Merlin: “merlin_pool_size”, “biome_name”, “realm” Birdlife: “population_growth”, “region_50km_ssm”, “birdlife_pool_size” |
r ggplot(merlin_city_data_fixed, aes(x = merlin_pool_size, y = response, color = realm)) + geom_point() + geom_smooth(method = "glm", se = F) + theme(legend.position = "bottom") |
`geom_smooth()` using formula 'y ~ x' |
r ggplot(merlin_city_data_fixed, aes(x = merlin_pool_size, y = response, color = biome_name)) + geom_point() + geom_smooth(method = "glm", se = F) + theme(legend.position = "bottom") |
`geom_smooth()` using formula 'y ~ x' |
r ggplot(birdlife_city_data_fixed, aes(x = birdlife_pool_size, y = response, color = region_50km_ssm)) + geom_point() + geom_smooth(method = "glm", se = F) + theme(legend.position = "bottom") |
`geom_smooth()` using formula 'y ~ x' |
r ggplot(birdlife_city_data_fixed, aes(x = birdlife_pool_size, y = response, color = population_growth)) + geom_point() + geom_smooth(method = "glm", se = F) + theme(legend.position = "bottom") |
`geom_smooth()` using formula 'y ~ x' |
r ggplot(merlin_city_data_fixed, aes(y = response, x = population_growth)) + geom_point() + geom_smooth(method = "glm", se = F) |
`geom_smooth()` using formula 'y ~ x' |
r ggplot(birdlife_city_data_fixed, aes(y = response, x = population_growth)) + geom_point() + geom_smooth(method = "glm", se = F) |
`geom_smooth()` using formula 'y ~ x' |
r ggplot(merlin_city_data_fixed, aes(y = response, x = region_50km_ssm)) + geom_point() + geom_smooth(method = "glm", se = F) |
`geom_smooth()` using formula 'y ~ x' |
r ggplot(birdlife_city_data_fixed, aes(y = response, x = region_50km_ssm)) + geom_point() + geom_smooth(method = "glm", se = F) |
`geom_smooth()` using formula 'y ~ x' |
library(boot)
merlin_city_data_fixed_no_boreal <- merlin_city_data_fixed[merlin_city_data_fixed$biome_name != 'Boreal Forests/Taiga',]
birdlife_city_data_fixed_no_boreal <- birdlife_city_data_fixed[birdlife_city_data_fixed$biome_name != 'Boreal Forests/Taiga',]
test_model <- function(data, formula) {
fit <- glm(formula, data = data)
cv.glm(data, fit)$delta
print(paste("R2", with(summary(fit), 1 - deviance/null.deviance)))
print(paste("CV Delta", cv.glm(data, fit)$delta))
}
test_model(merlin_city_data_fixed_no_boreal, response ~ merlin_pool_size)
[1] "R2 0.285894381786357"
[1] "CV Delta 13.2924069067549" "CV Delta 13.290989425668"
test_model(birdlife_city_data_fixed_no_boreal, response ~ birdlife_pool_size)
[1] "R2 0.132747072834318"
[1] "CV Delta 5.61376539055778" "CV Delta 5.61321468528147"
test_model(merlin_city_data_fixed_no_boreal, response ~ merlin_pool_size + realm)
[1] "R2 0.355479718662977"
[1] "CV Delta 13.1013113338907" "CV Delta 13.0956030666681"
test_model(birdlife_city_data_fixed_no_boreal, response ~ birdlife_pool_size + realm)
[1] "R2 0.215771844466201"
[1] "CV Delta 5.38032952583311" "CV Delta 5.37866865996081"
test_model(merlin_city_data_fixed_no_boreal, response ~ merlin_pool_size + biome_name)
[1] "R2 0.370210675877385"
[1] "CV Delta 13.3828176878773" "CV Delta 13.3745769694536"
test_model(birdlife_city_data_fixed_no_boreal, response ~ birdlife_pool_size + biome_name)
[1] "R2 0.223013658291514"
[1] "CV Delta 5.9146455418679" "CV Delta 5.91040383901086"
test_model(merlin_city_data_fixed_no_boreal, response ~ merlin_pool_size + biome_name + realm)
[1] "R2 0.404911112981243"
[1] "CV Delta 14.2088898476971" "CV Delta 14.1942054055947"
test_model(birdlife_city_data_fixed_no_boreal, response ~ birdlife_pool_size + biome_name + realm)
[1] "R2 0.282011390214033"
[1] "CV Delta 5.61291700874678" "CV Delta 5.6084158772121"
test_model(merlin_city_data_fixed_no_boreal, response ~ merlin_pool_size + population_growth)
[1] "R2 0.286929092142329"
[1] "CV Delta 13.5753866440016" "CV Delta 13.5727869610531"
test_model(birdlife_city_data_fixed_no_boreal, response ~ birdlife_pool_size + population_growth)
[1] "R2 0.134196770612853"
[1] "CV Delta 5.73874002430138" "CV Delta 5.73766284305409"
test_model(merlin_city_data_fixed_no_boreal, response ~ merlin_pool_size + population_growth + region_50km_ssm)
[1] "R2 0.290846971284311"
[1] "CV Delta 13.7387732818639" "CV Delta 13.7352820567926"
test_model(birdlife_city_data_fixed_no_boreal, response ~ birdlife_pool_size + population_growth + region_50km_ssm)
[1] "R2 0.151946781581196"
[1] "CV Delta 5.6860363390934" "CV Delta 5.68473754059936"
test_model(merlin_city_data_fixed_no_boreal, response ~ merlin_pool_size + population_growth + region_50km_ssm + biome_name + realm)
[1] "R2 0.409838701070553"
[1] "CV Delta 14.5182790115777" "CV Delta 14.5020009668314"
test_model(birdlife_city_data_fixed_no_boreal, response ~ birdlife_pool_size + population_growth + region_50km_ssm + biome_name + realm)
[1] "R2 0.287701159199758"
[1] "CV Delta 5.81750910931357" "CV Delta 5.81199090164147"
test_model(merlin_city_data_fixed_no_boreal, response ~ merlin_pool_size + region_50km_ssm + biome_name + realm)
[1] "R2 0.408007484714431"
[1] "CV Delta 14.3038421650361" "CV Delta 14.2885697715507"
test_model(birdlife_city_data_fixed_no_boreal, response ~ birdlife_pool_size + region_50km_ssm + biome_name + realm)
[1] "R2 0.287698318614449"
[1] "CV Delta 5.66636239287125" "CV Delta 5.66147724370065"
test_model(merlin_city_data_fixed_no_boreal, response ~ merlin_pool_size + region_50km_ssm + realm)
[1] "R2 0.359678173824136"
[1] "CV Delta 13.2441153115841" "CV Delta 13.2375767587302"
test_model(birdlife_city_data_fixed_no_boreal, response ~ birdlife_pool_size + region_50km_ssm + realm)
[1] "R2 0.226422719550459"
[1] "CV Delta 5.38608965982802" "CV Delta 5.38414802481297"
test_model(merlin_city_data_fixed_no_boreal, response ~ merlin_pool_size + region_50km_ssm + biome_name)
[1] "R2 0.370916016196228"
[1] "CV Delta 13.5692480406954" "CV Delta 13.5602234710036"
test_model(birdlife_city_data_fixed_no_boreal, response ~ birdlife_pool_size + region_50km_ssm + biome_name)
[1] "R2 0.229572509441164"
[1] "CV Delta 5.99492133909033" "CV Delta 5.9901231822001"
AIC(
glm(data = merlin_city_data_fixed_no_boreal, formula = response ~ merlin_pool_size + realm),
glm(data = merlin_city_data_fixed_no_boreal, formula = response ~ merlin_pool_size + biome_name),
glm(data = merlin_city_data_fixed_no_boreal, formula = response ~ merlin_pool_size + biome_name + realm),
glm(data = merlin_city_data_fixed_no_boreal, formula = response ~ merlin_pool_size + population_growth),
glm(data = merlin_city_data_fixed_no_boreal, formula = response ~ merlin_pool_size + region_50km_ssm),
glm(data = merlin_city_data_fixed_no_boreal, formula = response ~ merlin_pool_size + population_growth + region_50km_ssm),
glm(data = merlin_city_data_fixed_no_boreal, formula = response ~ merlin_pool_size + population_growth + region_50km_ssm + biome_name + realm),
glm(data = merlin_city_data_fixed_no_boreal, formula = response ~ merlin_pool_size + region_50km_ssm + biome_name + realm),
glm(data = merlin_city_data_fixed_no_boreal, formula = response ~ merlin_pool_size + region_50km_ssm + biome_name),
glm(data = merlin_city_data_fixed_no_boreal, formula = response ~ merlin_pool_size + region_50km_ssm + realm)
)
AIC(
glm(data = birdlife_city_data_fixed_no_boreal, formula = response ~ birdlife_pool_size + realm),
glm(data = birdlife_city_data_fixed_no_boreal, formula = response ~ birdlife_pool_size + biome_name),
glm(data = birdlife_city_data_fixed_no_boreal, formula = response ~ birdlife_pool_size + biome_name + realm),
glm(data = birdlife_city_data_fixed_no_boreal, formula = response ~ birdlife_pool_size + population_growth),
glm(data = birdlife_city_data_fixed_no_boreal, formula = response ~ birdlife_pool_size + region_50km_ssm),
glm(data = birdlife_city_data_fixed_no_boreal, formula = response ~ birdlife_pool_size + population_growth + region_50km_ssm),
glm(data = birdlife_city_data_fixed_no_boreal, formula = response ~ birdlife_pool_size + population_growth + region_50km_ssm + biome_name + realm)
)
merlin_city_data_named <- fetch_city_data_for('merlin', T)
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
name = col_character(),
response = col_double()
)
Joining, by = "name"
birdlife_city_data_named <- fetch_city_data_for('birdlife', T)
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
name = col_character(),
response = col_double()
)
Joining, by = "name"
merlin.fit <- glm(data = merlin_city_data_fixed, formula = response ~ merlin_pool_size + population_growth + region_50km_ssm + biome_name + realm)
plot(merlin.fit)
Warning: not plotting observations with leverage one:
113
jpeg("city_effect_merlin_model.jpg")
par(mfrow=c(2, 2))
plot(merlin.fit)
Warning: not plotting observations with leverage one:
113
dev.off()
null device
1
merlin_city_data_fixed[c(24, 30, 31),c("response", "biome_name", "realm", "region_50km_ssm", "population_growth", "merlin_pool_size")]
birdlife_city_data_fixed[c(24, 30, 31),c("response", "biome_name", "realm", "region_50km_ssm", "population_growth", "birdlife_pool_size")]
merlin_city_data_named$name[c(24, 30, 31)]
[1] "Casablanca" "Colombo" "Curitiba"
birdlife_city_data_named$name[c(24, 30, 31)]
[1] "Casablanca" "Colombo" "Curitiba"
mean(merlin_city_data_fixed$merlin_pool_size[merlin_city_data_fixed$biome_name == 'Mediterranean Forests, Woodlands & Scrub'])
[1] 240.5333
mean(merlin_city_data_fixed$merlin_pool_size[merlin_city_data_fixed$realm == 'Palearctic'])
[1] 216.3846
mean(merlin_city_data_fixed$merlin_pool_size[merlin_city_data_fixed$biome_name == 'Tropical & Subtropical Moist Broadleaf Forests'])
[1] 300.6136
mean(merlin_city_data_fixed$merlin_pool_size[merlin_city_data_fixed$realm == 'Indomalayan'])
[1] 282.775
mean(merlin_city_data_fixed$merlin_pool_size[merlin_city_data_fixed$biome_name == 'Tropical & Subtropical Moist Broadleaf Forests'])
[1] 300.6136
mean(merlin_city_data_fixed$merlin_pool_size[merlin_city_data_fixed$realm == 'Neotropic'])
[1] 334.3333
test_model(merlin_city_data_fixed[-c(24, 30, 31, 113),], response ~ merlin_pool_size + population_growth + region_50km_ssm + biome_name + realm)
[1] "R2 0.479991092783881"
[1] "CV Delta 10.665095263287" "CV Delta 10.6515112719514"
birdlife.fit <- glm(data = birdlife_city_data_fixed, formula = response ~ birdlife_pool_size + population_growth + region_50km_ssm + biome_name + realm)
plot(birdlife.fit)
Warning: not plotting observations with leverage one:
113
jpeg("city_effect_birdlife_model.jpg")
par(mfrow=c(2, 2))
plot(birdlife.fit)
Warning: not plotting observations with leverage one:
113
dev.off()
null device
1
birdlife_city_data_fixed[c(16, 30, 53),c("response", "biome_name", "realm", "region_50km_ssm", "population_growth", "birdlife_pool_size")]
merlin_city_data_fixed[c(16, 30, 53),c("response", "biome_name", "realm", "region_50km_ssm", "population_growth", "merlin_pool_size")]
mean(birdlife_city_data_fixed$birdlife_pool_size[birdlife_city_data_fixed$biome_name == 'Temperate Broadleaf & Mixed Forests'])
[1] 232.7188
mean(birdlife_city_data_fixed$birdlife_pool_size[birdlife_city_data_fixed$realm == 'Palearctic'])
[1] 211.9231
mean(birdlife_city_data_fixed$birdlife_pool_size[birdlife_city_data_fixed$biome_name == 'Tropical & Subtropical Moist Broadleaf Forests'])
[1] 364.2727
mean(birdlife_city_data_fixed$birdlife_pool_size[birdlife_city_data_fixed$realm == 'Indomalayan'])
[1] 337.75
mean(birdlife_city_data_fixed$birdlife_pool_size[birdlife_city_data_fixed$biome_name == 'Tropical & Subtropical Dry Broadleaf Forests'])
[1] 310.2143
mean(birdlife_city_data_fixed$birdlife_pool_size[birdlife_city_data_fixed$realm == 'Indomalayan'])
[1] 337.75
birdlife_city_data_named$name[c(16, 30, 53)]
[1] "Birmingham" "Colombo" "Hyderabad"
test_model(birdlife_city_data_fixed[-c(16, 30, 53, 113),], response ~ birdlife_pool_size + population_growth + region_50km_ssm + biome_name + realm)
[1] "R2 0.32316086964121"
[1] "CV Delta 4.41986102308419" "CV Delta 4.41468426423238"
| But can we order cities based on how good they are for biodiversity? |
merlin_city_data_fixed$residuals <- resid(merlin.fit)
birdlife_city_data_fixed$residuals <- resid(birdlife.fit)
ggplot(merlin_city_data_fixed, aes(y = response, x = residuals)) + geom_point() + geom_smooth(method = "lm", se = F)
`geom_smooth()` using formula 'y ~ x'
cor(merlin_city_data_fixed$response, merlin_city_data_fixed$residuals)
[1] 0.7665527
ggplot(birdlife_city_data_fixed, aes(y = response, x = residuals)) + geom_point() + geom_smooth(method = "lm", se = F)
`geom_smooth()` using formula 'y ~ x'
cor(birdlife_city_data_fixed$response, birdlife_city_data_fixed$residuals)
[1] 0.8398848
ordered_cities <- data.frame(
ranked_performance = 1:nrow(merlin_city_data_named),
merlin_base_response = merlin_city_data_named$name[order(-merlin_city_data$response)],
birdlife_base_response = merlin_city_data_named$name[order(-birdlife_city_data$response)],
merlin_model_residuals = merlin_city_data_named$name[order(-merlin_city_data$residuals)],
birdlife_model_residuals = merlin_city_data_named$name[order(-birdlife_city_data$residuals)]
)
ordered_cities
write_csv(ordered_cities, "city_effect_residuals.csv")
| What is going on with the response? |
library(ggrepel)
merlin_city_data_fixed$name <- merlin_city_data_named$name
plot_merlin_poolsize <- ggplot(merlin_city_data_fixed, aes(y = response, x = merlin_pool_size)) +
geom_smooth(method = "lm", se = F) +
geom_point(aes(color = residuals), size = 4) +
geom_label_repel(aes(label = name), size = 4) +
xlab("Pool Size") + ylab("City Random Effect Response") +
guides(color=guide_legend(title="Model residuals 'response ~ pool_size'")) +
theme_bw() + theme(legend.position="bottom", legend.title=element_text(size=9), legend.text=element_text(size=8), legend.key.size = unit(1,"line")) +
labs(title = "Merlin response given pool size")
plot_merlin_poolsize
`geom_smooth()` using formula 'y ~ x'
Warning: ggrepel: 123 unlabeled data points (too many overlaps). Consider increasing max.overlaps
birdlife_city_data_fixed$name <- birdlife_city_data_named$name
plot_birdlife_poolsize <- ggplot(birdlife_city_data_fixed, aes(y = response, x = birdlife_pool_size)) +
geom_smooth(method = "lm", se = F) +
geom_point(aes(color = residuals), size = 4) +
geom_label_repel(aes(label = name), size = 4) +
xlab("Pool Size") + ylab("City Random Effect Response") +
guides(color=guide_legend(title="Model residuals 'response ~ pool_size'")) +
theme_bw() + theme(legend.position="bottom", legend.title=element_text(size=9), legend.text=element_text(size=8), legend.key.size = unit(1,"line")) +
labs(title = "Birdlife response given pool size")
plot_birdlife_poolsize
`geom_smooth()` using formula 'y ~ x'
Warning: ggrepel: 114 unlabeled data points (too many overlaps). Consider increasing max.overlaps
| Summary of models |
summary(merlin.fit)
Call:
glm(formula = response ~ merlin_pool_size + population_growth +
region_50km_ssm + biome_name + realm, data = merlin_city_data_fixed)
Deviance Residuals:
Min 1Q Median 3Q Max
-7.5836 -1.9802 -0.2806 1.4883 16.1666
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.558383 4.200653 0.609 0.5437
merlin_pool_size -0.027339 0.003575 -7.647 6.55e-12 ***
population_growth 0.003068 0.005114 0.600 0.5497
region_50km_ssm -0.053173 0.072982 -0.729 0.4677
biome_nameDeserts & Xeric Shrublands 4.605968 3.868850 1.191 0.2363
biome_nameFlooded Grasslands & Savannas 0.525908 4.481070 0.117 0.9068
biome_nameMangroves 8.441618 4.591210 1.839 0.0685 .
biome_nameMediterranean Forests, Woodlands & Scrub 4.145677 3.732373 1.111 0.2690
biome_nameMontane Grasslands & Shrublands 5.023979 4.774921 1.052 0.2949
biome_nameTemperate Broadleaf & Mixed Forests 4.686888 3.622288 1.294 0.1983
biome_nameTemperate Conifer Forests 4.317564 4.479751 0.964 0.3372
biome_nameTemperate Grasslands, Savannas & Shrublands 5.637210 4.037582 1.396 0.1653
biome_nameTropical & Subtropical Coniferous Forests 7.544896 4.609955 1.637 0.1044
biome_nameTropical & Subtropical Dry Broadleaf Forests 4.834888 3.977950 1.215 0.2267
biome_nameTropical & Subtropical Grasslands, Savannas & Shrublands 7.209246 4.186447 1.722 0.0877 .
biome_nameTropical & Subtropical Moist Broadleaf Forests 4.084507 3.830228 1.066 0.2885
realmAustralasia -0.633465 2.622994 -0.242 0.8096
realmIndomalayan 1.301503 1.655451 0.786 0.4334
realmNearctic 2.083151 1.879997 1.108 0.2701
realmNeotropic 2.585444 1.767718 1.463 0.1463
realmPalearctic -0.323305 1.843458 -0.175 0.8611
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 12.50991)
Null deviance: 2469.6 on 136 degrees of freedom
Residual deviance: 1451.1 on 116 degrees of freedom
AIC: 756.13
Number of Fisher Scoring iterations: 2
summary(birdlife.fit)
Call:
glm(formula = response ~ birdlife_pool_size + population_growth +
region_50km_ssm + biome_name + realm, data = birdlife_city_data_fixed)
Deviance Residuals:
Min 1Q Median 3Q Max
-5.1697 -1.2864 -0.2075 0.8359 9.4606
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.639e+00 2.811e+00 1.295 0.1980
birdlife_pool_size -1.298e-02 2.705e-03 -4.798 4.82e-06 ***
population_growth -7.172e-05 3.334e-03 -0.022 0.9829
region_50km_ssm -4.483e-02 4.664e-02 -0.961 0.3385
biome_nameDeserts & Xeric Shrublands 3.037e+00 2.499e+00 1.215 0.2267
biome_nameFlooded Grasslands & Savannas 5.831e-01 2.904e+00 0.201 0.8412
biome_nameMangroves 3.282e+00 2.980e+00 1.101 0.2730
biome_nameMediterranean Forests, Woodlands & Scrub 2.506e+00 2.415e+00 1.038 0.3015
biome_nameMontane Grasslands & Shrublands 2.011e+00 3.094e+00 0.650 0.5170
biome_nameTemperate Broadleaf & Mixed Forests 3.067e+00 2.345e+00 1.308 0.1934
biome_nameTemperate Conifer Forests 4.456e+00 2.907e+00 1.533 0.1280
biome_nameTemperate Grasslands, Savannas & Shrublands 4.342e+00 2.616e+00 1.660 0.0996 .
biome_nameTropical & Subtropical Coniferous Forests 3.878e+00 2.988e+00 1.298 0.1969
biome_nameTropical & Subtropical Dry Broadleaf Forests 3.043e+00 2.570e+00 1.184 0.2389
biome_nameTropical & Subtropical Grasslands, Savannas & Shrublands 1.675e+00 2.724e+00 0.615 0.5400
biome_nameTropical & Subtropical Moist Broadleaf Forests 1.819e+00 2.482e+00 0.733 0.4651
realmAustralasia -1.882e+00 1.700e+00 -1.107 0.2706
realmIndomalayan -8.270e-01 1.076e+00 -0.769 0.4436
realmNearctic -2.992e+00 1.228e+00 -2.437 0.0163 *
realmNeotropic -6.657e-01 1.143e+00 -0.582 0.5615
realmPalearctic -2.961e+00 1.225e+00 -2.417 0.0172 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 5.26285)
Null deviance: 865.45 on 136 degrees of freedom
Residual deviance: 610.49 on 116 degrees of freedom
AIC: 637.51
Number of Fisher Scoring iterations: 2
| Review anovas |
birdlife.biome.anovoa <- aov(response ~ biome_name, data=birdlife_city_data_fixed)
summary(birdlife.biome.anovoa)
Df Sum Sq Mean Sq F value Pr(>F)
biome_name 12 98.7 8.225 1.33 0.21
Residuals 124 766.7 6.183
merlin.biome.anovoa <- aov(response ~ biome_name, data=merlin_city_data_fixed)
summary(merlin.biome.anovoa)
Df Sum Sq Mean Sq F value Pr(>F)
biome_name 12 212.3 17.69 0.972 0.479
Residuals 124 2257.3 18.20
birdlife.realm.anovoa <- aov(response ~ realm, data=birdlife_city_data_fixed)
summary(birdlife.realm.anovoa)
Df Sum Sq Mean Sq F value Pr(>F)
realm 5 0.0 0.000 0 1
Residuals 131 865.4 6.606
merlin.realm.anovoa <- aov(response ~ realm, data=merlin_city_data_fixed)
summary(merlin.realm.anovoa)
Df Sum Sq Mean Sq F value Pr(>F)
realm 5 0 0.00 0 1
Residuals 131 2470 18.85
interaction.plot(merlin_city_data_fixed$realm, merlin_city_data_fixed$biome_name, merlin_city_data_fixed$response)
meriin.addative.anova <- aov(response ~ biome_name + realm, data=merlin_city_data_fixed)
summary(meriin.addative.anova)
Df Sum Sq Mean Sq F value Pr(>F)
biome_name 12 212.3 17.692 0.938 0.511
realm 5 13.9 2.785 0.148 0.980
Residuals 119 2243.4 18.852
interaction.plot(merlin_city_data_fixed$realm, merlin_city_data_fixed$biome_name, merlin_city_data_fixed$response)
meriin.interaction.anova <- aov(response ~ biome_name * realm, data=merlin_city_data_fixed)
summary(meriin.interaction.anova)
Df Sum Sq Mean Sq F value Pr(>F)
biome_name 12 212.3 17.692 0.890 0.559
realm 5 13.9 2.785 0.140 0.983
biome_name:realm 13 136.3 10.487 0.528 0.903
Residuals 106 2107.1 19.878
ggplot(merlin_city_data_fixed, aes(x = response, y = realm)) + geom_boxplot()
library("stringr")
ggplot(merlin_city_data_fixed, aes(x = response, y = biome_name)) +
geom_boxplot() +
facet_wrap(~ realm, scales = "free") +
theme(text = element_text(size = 30), legend.text=element_text(size=20)) +
scale_y_discrete(labels = function(x) str_wrap(x, width = 10))
ggplot(birdlife_city_data_fixed, aes(x = response, y = biome_name)) +
geom_boxplot() +
facet_wrap(~ realm, scales = "free") +
theme(text = element_text(size = 30), legend.text=element_text(size=20)) +
scale_y_discrete(labels = function(x) str_wrap(x, width = 10))
ggplot(merlin_city_data_fixed, aes(y = response, x = region_50km_ssm, color = realm)) + geom_point() + geom_smooth(method = "lm", se = F)
`geom_smooth()` using formula 'y ~ x'
ggplot(merlin_city_data_fixed, aes(y = response, x = region_50km_ssm, color = biome_name)) + geom_point() + geom_smooth(method = "lm", se = F)
`geom_smooth()` using formula 'y ~ x'
test_model(merlin_city_data_fixed_no_boreal, response ~ merlin_pool_size + population_growth + region_50km_ssm + biome_name + realm)
[1] "R2 0.409838701070553"
[1] "CV Delta 14.5182790115777" "CV Delta 14.5020009668315"
test_model(birdlife_city_data_fixed_no_boreal, response ~ birdlife_pool_size + population_growth + region_50km_ssm + biome_name + realm)
[1] "R2 0.287701159199758"
[1] "CV Delta 5.81750910931357" "CV Delta 5.81199090164147"
test_model(merlin_city_data_fixed_no_boreal, response ~ merlin_pool_size + population_growth + region_50km_ssm * biome_name * realm)
[1] "R2 0.519307720872686"
[1] "CV Delta 13363.0206446473" "CV Delta 13264.8507709478"
test_model(birdlife_city_data_fixed_no_boreal, response ~ birdlife_pool_size + population_growth + region_50km_ssm * biome_name * realm)
[1] "R2 0.432634735857367"
[1] "CV Delta 11005.7362693327" "CV Delta 10924.8441449411"
test_model(merlin_city_data_fixed_no_boreal, response ~ merlin_pool_size + region_50km_ssm * biome_name * realm)
[1] "R2 0.519293181227938"
[1] "CV Delta 13625.5384965618" "CV Delta 13525.4374613711"
min(merlin_city_data$response)
[1] -9.204166
max(merlin_city_data$response)
[1] 18.43521
min(birdlife_city_data$response)
[1] -4.910185
max(birdlife_city_data$response)
[1] 10.45823
| Maybe the either pool is missing the random pool sizes? |
either_city_data <- fetch_city_data_for('either')
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
name = col_character(),
response = col_double()
)
Joining, by = "name"
either_city_data
either_city_data_fixed <- rfImpute(response ~ ., either_city_data)
| Out-of-bag |
Tree | MSE %Var(y) |
300 | 4.843 95.24 |
| Out-of-bag |
Tree | MSE %Var(y) |
300 | 4.704 92.50 |
| Out-of-bag |
Tree | MSE %Var(y) |
300 | 5.006 98.43 |
| Out-of-bag |
Tree | MSE %Var(y) |
300 | 4.796 94.32 |
| Out-of-bag |
Tree | MSE %Var(y) |
300 | 4.824 94.87 |
either_city_data_fixed
select_variables_from_random_forest(either_city_data_fixed)
[1] "either_pool_size" "region_50km_ssm" "population_growth"
[4] "region_100km_ssm" "realm" "region_100km_cultivated"
[7] "region_20km_average_pop_density" "region_100km_susm" "region_20km_ssm"
[10] "region_50km_cultivated" "temperature_monthly_min" "region_50km_elevation_delta"
[13] "region_50km_average_pop_density" "shrubs" "city_ssm"
[16] "region_20km_susm" "permanent_water" "region_100km_average_pop_density"
[19] "region_50km_susm" "rainfall_monthly_min" "mean_population_exposure_to_pm2_5_2019"
[22] "region_20km_cultivated" "biome_name" "share_of_population_within_400m_of_open_space"
[25] "region_20km_elevation_delta" "rainfall_monthly_max" "temperature_annual_average"
[28] "city_elevation_delta" "percentage_urban_area_as_open_public_spaces_and_streets" "happiness_positive_effect"
[31] "city_susm" "city_average_pop_density" "region_100km_elevation_delta"
[34] "region_20km_ndvi" "region_50km_ndvi" "city_max_pop_density"
[37] "city_mean_elevation" "percentage_urban_area_as_open_public_spaces" "region_20km_urban"
[40] "region_100km_urban" "temperature_monthly_max" "region_20km_percentage_protected"
[43] "region_50km_percentage_protected" "rainfall_annual_average" "herbaceous_wetland"
[46] "region_50km_mean_elevation" "region_20km_mean_elevation" "region_50km_urban"
[49] "city_ndvi" "region_100km_mean_elevation" "cultivated"
[52] "city_percentage_protected" "happiness_future_life" "region_100km_ndvi"
[55] "happiness_negative_effect" "open_forest" "urban"
[58] "herbaceous_vegetation" "closed_forest" "percentage_urban_area_as_streets"
select_variables_from_random_forest(either_city_data_fixed_single_scale)
[1] "either_pool_size" "region_50km_ssm" "population_growth"
[4] "region_100km_cultivated" "realm" "region_20km_average_pop_density"
[7] "region_100km_susm" "city_ssm" "temperature_monthly_min"
[10] "shrubs" "region_50km_elevation_delta" "biome_name"
[13] "permanent_water" "rainfall_monthly_min" "share_of_population_within_400m_of_open_space"
[16] "percentage_urban_area_as_open_public_spaces" "mean_population_exposure_to_pm2_5_2019" "temperature_annual_average"
[19] "city_average_pop_density" "percentage_urban_area_as_open_public_spaces_and_streets" "happiness_positive_effect"
[22] "city_mean_elevation" "region_20km_ndvi" "city_ndvi"
[25] "rainfall_annual_average" "city_susm" "region_50km_mean_elevation"
[28] "temperature_monthly_max" "open_forest" "happiness_negative_effect"
[31] "herbaceous_vegetation" "closed_forest" "percentage_urban_area_as_streets"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size")])
[1] "Mean 4.69266103594363 , SD: 0.0510981914922011 , Mean + SD: 4.74375922743583"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm")])
[1] "Mean 4.31336571084706 , SD: 0.0477119635734014 , Mean + SD: 4.36107767442046"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth")])
[1] "Mean 3.84257546120148 , SD: 0.0583801219753036 , Mean + SD: 3.90095558317678"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated")])
[1] "Mean 3.97972587397746 , SD: 0.0610228479845339 , Mean + SD: 4.04074872196199"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm")])
[1] "Mean 3.71041026502135 , SD: 0.0604558222228108 , Mean + SD: 3.77086608724416"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm", "region_20km_average_pop_density")])
[1] "Mean 3.66339757092933 , SD: 0.0663715820467887 , Mean + SD: 3.72976915297612"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm", "region_20km_average_pop_density", "region_100km_susm")])
[1] "Mean 3.85235242406019 , SD: 0.0679378104226804 , Mean + SD: 3.92029023448287"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm", "region_20km_average_pop_density", "region_100km_susm", "city_ssm")])
[1] "Mean 4.00570750061168 , SD: 0.0578080868110856 , Mean + SD: 4.06351558742276"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm", "region_20km_average_pop_density", "region_100km_susm", "city_ssm", "temperature_monthly_min")])
[1] "Mean 3.92441683942565 , SD: 0.0710013584983004 , Mean + SD: 3.99541819792395"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm", "region_20km_average_pop_density", "region_100km_susm", "city_ssm", "temperature_monthly_min", "shrubs")])
[1] "Mean 3.93883898944311 , SD: 0.0771381589247996 , Mean + SD: 4.01597714836791"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm", "region_20km_average_pop_density", "region_100km_susm", "city_ssm", "temperature_monthly_min", "shrubs", "region_50km_elevation_delta")])
[1] "Mean 3.97887705384933 , SD: 0.0641411182593364 , Mean + SD: 4.04301817210867"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm", "region_20km_average_pop_density", "region_100km_susm", "city_ssm", "temperature_monthly_min", "shrubs", "region_50km_elevation_delta", "biome_name")])
[1] "Mean 4.12424218224607 , SD: 0.0654822740801436 , Mean + SD: 4.18972445632622"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm", "region_20km_average_pop_density", "region_100km_susm", "city_ssm", "temperature_monthly_min", "shrubs", "region_50km_elevation_delta", "biome_name", "permanent_water")])
[1] "Mean 4.13307783362438 , SD: 0.0520988790244683 , Mean + SD: 4.18517671264885"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm", "region_20km_average_pop_density", "region_100km_susm", "city_ssm", "temperature_monthly_min", "shrubs", "region_50km_elevation_delta", "biome_name", "permanent_water", "rainfall_monthly_min")])
[1] "Mean 4.14763343564045 , SD: 0.0712446152059106 , Mean + SD: 4.21887805084636"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm", "region_20km_average_pop_density", "region_100km_susm", "city_ssm", "temperature_monthly_min", "shrubs", "region_50km_elevation_delta", "biome_name", "permanent_water", "rainfall_monthly_min", "share_of_population_within_400m_of_open_space")])
[1] "Mean 4.1661488519461 , SD: 0.0659605155787169 , Mean + SD: 4.23210936752482"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm", "region_20km_average_pop_density", "region_100km_susm", "city_ssm", "temperature_monthly_min", "shrubs", "region_50km_elevation_delta", "biome_name", "permanent_water", "rainfall_monthly_min", "share_of_population_within_400m_of_open_space", "percentage_urban_area_as_open_public_spaces")])
[1] "Mean 4.22755212132415 , SD: 0.0750663802220243 , Mean + SD: 4.30261850154617"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm", "region_20km_average_pop_density", "region_100km_susm", "city_ssm", "temperature_monthly_min", "shrubs", "region_50km_elevation_delta", "biome_name", "permanent_water", "rainfall_monthly_min", "share_of_population_within_400m_of_open_space", "percentage_urban_area_as_open_public_spaces", "mean_population_exposure_to_pm2_5_2019")])
[1] "Mean 4.23552884460268 , SD: 0.0745476529202964 , Mean + SD: 4.31007649752298"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm", "region_20km_average_pop_density", "region_100km_susm", "city_ssm", "temperature_monthly_min", "shrubs", "region_50km_elevation_delta", "biome_name", "permanent_water", "rainfall_monthly_min", "share_of_population_within_400m_of_open_space", "percentage_urban_area_as_open_public_spaces", "mean_population_exposure_to_pm2_5_2019", "temperature_annual_average")])
[1] "Mean 4.26508667731091 , SD: 0.0773348834939869 , Mean + SD: 4.3424215608049"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm", "region_20km_average_pop_density", "region_100km_susm", "city_ssm", "temperature_monthly_min", "shrubs", "region_50km_elevation_delta", "biome_name", "permanent_water", "rainfall_monthly_min", "share_of_population_within_400m_of_open_space", "percentage_urban_area_as_open_public_spaces", "mean_population_exposure_to_pm2_5_2019", "temperature_annual_average", "city_average_pop_density")])
[1] "Mean 4.28326756101597 , SD: 0.0755017970060168 , Mean + SD: 4.35876935802199"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm", "region_20km_average_pop_density", "region_100km_susm", "city_ssm", "temperature_monthly_min", "shrubs", "region_50km_elevation_delta", "biome_name", "permanent_water", "rainfall_monthly_min", "share_of_population_within_400m_of_open_space", "percentage_urban_area_as_open_public_spaces", "mean_population_exposure_to_pm2_5_2019", "temperature_annual_average", "city_average_pop_density", "percentage_urban_area_as_open_public_spaces_and_streets")])
[1] "Mean 4.34654977254916 , SD: 0.0660243511076323 , Mean + SD: 4.41257412365679"
create_fifty_rows_of_oob(either_city_data_fixed[,c("response", "either_pool_size", "region_50km_ssm", "population_growth", "region_100km_cultivated", "realm", "region_20km_average_pop_density", "region_100km_susm", "city_ssm", "temperature_monthly_min", "shrubs", "region_50km_elevation_delta", "biome_name", "permanent_water", "rainfall_monthly_min", "share_of_population_within_400m_of_open_space", "percentage_urban_area_as_open_public_spaces", "mean_population_exposure_to_pm2_5_2019", "temperature_annual_average", "city_average_pop_density", "percentage_urban_area_as_open_public_spaces_and_streets", "happiness_positive_effect")])
[1] "Mean 4.35777574870304 , SD: 0.0710294002562521 , Mean + SD: 4.42880514895929"
“either_pool_size”, “region_50km_ssm”, “population_growth”, “region_100km_cultivated”, “realm”, “region_20km_average_pop_density”
test_model(either_city_data_fixed, response ~ either_pool_size)
[1] "R2 0.200828561945557"
[1] "CV Delta 4.17185408588075" "CV Delta 4.17145621339379"
test_model(either_city_data_fixed, response ~ either_pool_size + region_50km_ssm)
[1] "R2 0.218556419678468"
[1] "CV Delta 4.1385005044194" "CV Delta 4.13789124054347"
test_model(either_city_data_fixed, response ~ either_pool_size + region_50km_ssm + population_growth)
[1] "R2 0.218651209733749"
[1] "CV Delta 4.24526922992079" "CV Delta 4.24423824005575"
test_model(either_city_data_fixed, response ~ either_pool_size + region_50km_ssm + population_growth + region_100km_cultivated)
[1] "R2 0.240163336907313"
[1] "CV Delta 4.1925470002567" "CV Delta 4.19130237140134"
test_model(either_city_data_fixed, response ~ either_pool_size + region_50km_ssm + population_growth + region_100km_cultivated + realm)
[1] "R2 0.337735042304765"
[1] "CV Delta 3.90417611638399" "CV Delta 3.90212025458098"
test_model(either_city_data_fixed, response ~ either_pool_size + region_50km_ssm + population_growth + region_100km_cultivated + realm + region_20km_average_pop_density)
[1] "R2 0.348335268589981"
[1] "CV Delta 3.89011193891222" "CV Delta 3.88782718460849"
cor(either_city_data_fixed$residuals, either_city_data_fixed$response)
[1] 0.8072575
mean(either_city_data_fixed$either_pool_size[either_city_data_fixed$biome_name == 'Temperate Broadleaf & Mixed Forests'])
[1] 293.4062
mean(either_city_data_fixed$either_pool_size[either_city_data_fixed$realm == 'Palearctic'])
[1] 274.0769
mean(either_city_data_fixed$either_pool_size[either_city_data_fixed$biome_name == 'Tropical & Subtropical Moist Broadleaf Forests'])
[1] 428.25
mean(either_city_data_fixed$either_pool_size[either_city_data_fixed$realm == 'Indomalayan'])
[1] 395.225
mean(either_city_data_fixed$either_pool_size[either_city_data_fixed$biome_name == 'Tropical & Subtropical Dry Broadleaf Forests'])
[1] 379.5
mean(either_city_data_fixed$either_pool_size[either_city_data_fixed$realm == 'Indomalayan'])
[1] 395.225
birdlife_city_data_named$name[c(16, 30, 53)]
[1] "Birmingham" "Colombo" "Hyderabad"
plot(either.fit.2)
Warning: not plotting observations with leverage one:
113
| Use cross validation and dropping terms to find best model |
predict <- NULL
error <- NULL
for(i in 1:nrow(merlin_city_data_fixed_no_boreal)) {
fit <- glm(formula = response ~ merlin_pool_size + population_growth + region_50km_ssm + biome_name + realm, data = merlin_city_data_fixed_no_boreal[-c(i),])
predict[i] <- predict(fit, newdata=merlin_city_data_fixed_no_boreal[c(i),])
error[i] <- (predict[i] - merlin_city_data_fixed_no_boreal$response[c(i)])^2
}
mean(error)
[1] 14.51828
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + population_growth + region_50km_ssm + biome_name + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 14.51828
– Can we drop one?
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + population_growth + region_50km_ssm + biome_name, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 13.8059
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + population_growth + region_50km_ssm + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 13.46984
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + population_growth + biome_name + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 14.39671
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + region_50km_ssm + biome_name + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 14.30384
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ region_50km_ssm + biome_name + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 20.17595
– drop populaion_growth (14.30384)
– can we drop another?
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + region_50km_ssm + biome_name, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 13.56925
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + region_50km_ssm + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 13.24412
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + biome_name + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 14.20889
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ region_50km_ssm + biome_name + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 20.17595
– drop ssm (14.20889)
– can we drop another?
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + biome_name, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 13.38282
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 13.10131
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ biome_name + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 20.56763
– drop biome (13.10131)
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 13.29241
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 19.48513
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size * realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 13.85502
| – best model with merlin is pool size + realm (CV error 13.10131) |
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + population_growth + region_50km_ssm + biome_name + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.817509
– can we drop a variable?
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + population_growth + region_50km_ssm + biome_name, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 6.1237
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + population_growth + region_50km_ssm + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.492536
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + population_growth + biome_name + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.764969
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + region_50km_ssm + biome_name + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.666362
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ population_growth + region_50km_ssm + biome_name + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 7.061169
– drop biome (5.492536)
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + population_growth + region_50km_ssm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.686036
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + population_growth + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.499406
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + region_50km_ssm + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.38609
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ population_growth + region_50km_ssm + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 6.690224
– drop population growth (5.38609)
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + region_50km_ssm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.577701
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.38033
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ region_50km_ssm + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 6.637624
– drop ssm (5.38033)
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.613765
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 6.746644
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size * realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
| – so best model with birdlife is pool size + realm |
ggplot(merlin_city_data_fixed_no_boreal, aes(x = merlin_pool_size, y = response)) + geom_point() + geom_smooth(method = "glm", se = F) + facet_wrap(~ realm)
`geom_smooth()` using formula 'y ~ x'
ggplot(birdlife_city_data_fixed_no_boreal, aes(x = birdlife_pool_size, y = response)) + geom_point() + geom_smooth(method = "glm", se = F) + facet_wrap(~ realm)
`geom_smooth()` using formula 'y ~ x'
birdlife.fit <- glm(data = birdlife_city_data_fixed_no_boreal, formula = response ~ birdlife_pool_size + realm)
summary(birdlife.fit)
Call:
glm(formula = response ~ birdlife_pool_size + realm, data = birdlife_city_data_fixed_no_boreal)
Deviance Residuals:
Min 1Q Median 3Q Max
-5.9240 -1.2855 -0.3158 0.8601 9.4227
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.724170 1.182215 4.842 3.61e-06 ***
birdlife_pool_size -0.014963 0.002513 -5.955 2.31e-08 ***
realmAustralasia -1.504499 1.508076 -0.998 0.32033
realmIndomalayan -0.670291 0.785239 -0.854 0.39490
realmNearctic -1.963875 0.918121 -2.139 0.03432 *
realmNeotropic -0.288102 0.832533 -0.346 0.72987
realmPalearctic -2.474092 0.891147 -2.776 0.00632 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 5.210381)
Null deviance: 857.07 on 135 degrees of freedom
Residual deviance: 672.14 on 129 degrees of freedom
AIC: 619.25
Number of Fisher Scoring iterations: 2
with(summary(birdlife.fit), 1 - deviance/null.deviance)
[1] 0.2157718
plot(birdlife.fit)
ggplot(birdlife_city_data_fixed_no_boreal, aes(x = birdlife_pool_size, y = response)) +
geom_point() +
geom_smooth(method = "glm", se = F) +
geom_text(aes(label = name), data = birdlife_city_data_fixed_no_boreal[c(16, 53, 124),], size = 3, position = "dodge", vjust = "inward", hjust = "inward", color = "red", angle=-15) +
geom_point(data = birdlife_city_data_fixed_no_boreal[c(16, 53, 124),], color = "red") +
facet_wrap(~ realm) +
theme_bw()
`geom_smooth()` using formula 'y ~ x'
Warning: Width not defined. Set with `position_dodge(width = ?)`
merlin.fit <- glm(data = merlin_city_data_fixed_no_boreal, formula = response ~ merlin_pool_size + realm)
summary(merlin.fit)
Call:
glm(formula = response ~ merlin_pool_size + realm, data = merlin_city_data_fixed_no_boreal)
Deviance Residuals:
Min 1Q Median 3Q Max
-8.5389 -1.7132 -0.5164 1.3898 16.2335
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.823312 1.406157 5.564 1.46e-07 ***
merlin_pool_size -0.028685 0.003401 -8.434 5.79e-14 ***
realmAustralasia -1.923665 2.294340 -0.838 0.403
realmIndomalayan 0.288224 1.193798 0.241 0.810
realmNearctic 1.499468 1.327678 1.129 0.261
realmNeotropic 1.767199 1.293308 1.366 0.174
realmPalearctic -1.490884 1.214536 -1.228 0.222
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 12.28537)
Null deviance: 2458.9 on 135 degrees of freedom
Residual deviance: 1584.8 on 129 degrees of freedom
AIC: 735.91
Number of Fisher Scoring iterations: 2
with(summary(merlin.fit), 1 - deviance/null.deviance)
[1] 0.3554797
plot(merlin.fit)
ggplot(merlin_city_data_fixed_no_boreal, aes(x = merlin_pool_size, y = response)) +
geom_point(size = 1) +
geom_smooth(method = "glm", se = F) +
geom_text(aes(label = name), data = merlin_city_data_fixed_no_boreal[c(7, 24, 77),], size = 3, position = "dodge", vjust = "inward", hjust = "inward", color = "red", angle=-15) +
geom_point(data = merlin_city_data_fixed_no_boreal[c(7, 24, 77),], color = "red") +
facet_wrap(~ realm) +
theme_bw()
`geom_smooth()` using formula 'y ~ x'
Warning: Width not defined. Set with `position_dodge(width = ?)`
ggplot(merlin_city_data_fixed_no_boreal, aes(y = response, x = residuals)) +
geom_smooth(method = "lm", se = F) +
geom_point(aes(color = realm)) +
geom_text(aes(label = name), data = merlin_city_data_fixed_no_boreal[c(7, 24, 77),], size = 4, position = "dodge", vjust = "inward", hjust = "inward") +
labs(title = "Merlin", subtitle = paste("Correlation", cor(merlin_city_data_fixed_no_boreal$residuals, merlin_city_data_fixed_no_boreal$response))) +
theme_bw()
`geom_smooth()` using formula 'y ~ x'
Warning: Width not defined. Set with `position_dodge(width = ?)`
ggplot(birdlife_city_data_fixed_no_boreal, aes(y = response, x = residuals)) +
geom_smooth(method = "lm", se = F, alpha = 0.5) +
geom_point(aes(color = realm)) +
geom_text(aes(label = name), data = birdlife_city_data_fixed_no_boreal[c(16, 53, 124),], size = 4, position = "dodge", vjust = "inward", hjust = "inward") +
labs(title = "Birdlife", subtitle = paste("Correlation", cor(birdlife_city_data_fixed_no_boreal$residuals, birdlife_city_data_fixed_no_boreal$response))) +
theme_bw()
`geom_smooth()` using formula 'y ~ x'
Warning: Width not defined. Set with `position_dodge(width = ?)`
summary(birdlife.aov)
Df Sum Sq Mean Sq F value Pr(>F)
birdlife_city_data_fixed_no_boreal$realm 5 0.16 0.0315 0.017 1
Residuals 130 246.42 1.8956
merlin.aov <- aov(merlin_city_data_fixed_no_boreal$residuals_of_fit ~ merlin_city_data_fixed_no_boreal$realm)
summary(merlin.aov)
Df Sum Sq Mean Sq F value Pr(>F)
merlin_city_data_fixed_no_boreal$realm 5 0.2 0.04 0.005 1
Residuals 130 1007.6 7.75
shapiro.test(birdlife_city_data_fixed_no_boreal$response)
Shapiro-Wilk normality test
data: birdlife_city_data_fixed_no_boreal$response
W = 0.93733, p-value = 8.842e-06
shapiro.test(birdlife_city_data_fixed_no_boreal$residuals)
Shapiro-Wilk normality test
data: birdlife_city_data_fixed_no_boreal$residuals
W = 0.9341, p-value = 5.274e-06
shapiro.test(birdlife_city_data_fixed_no_boreal$residuals_of_fit)
Shapiro-Wilk normality test
data: birdlife_city_data_fixed_no_boreal$residuals_of_fit
W = 0.96887, p-value = 0.003322
shapiro.test(merlin_city_data_fixed_no_boreal$response)
Shapiro-Wilk normality test
data: merlin_city_data_fixed_no_boreal$response
W = 0.95308, p-value = 0.0001366
shapiro.test(merlin_city_data_fixed_no_boreal$residuals)
Shapiro-Wilk normality test
data: merlin_city_data_fixed_no_boreal$residuals
W = 0.9243, p-value = 1.19e-06
shapiro.test(merlin_city_data_fixed_no_boreal$residuals_of_fit)
Shapiro-Wilk normality test
data: merlin_city_data_fixed_no_boreal$residuals_of_fit
W = 0.95023, p-value = 8.081e-05
Variances of groups are NOT significantly different:
bartlett.test(merlin_city_data_fixed_no_boreal$residuals, merlin_city_data_fixed_no_boreal$realm)
Bartlett test of homogeneity of variances
data: merlin_city_data_fixed_no_boreal$residuals and merlin_city_data_fixed_no_boreal$realm
Bartlett's K-squared = 10.491, df = 5, p-value = 0.06246
leveneTest(merlin_city_data_fixed_no_boreal$residuals, merlin_city_data_fixed_no_boreal$realm)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 5 1.0579 0.3868
130
Variances of groups are NOT significantly different:
bartlett.test(birdlife_city_data_fixed_no_boreal$residuals, birdlife_city_data_fixed_no_boreal$realm)
Bartlett test of homogeneity of variances
data: birdlife_city_data_fixed_no_boreal$residuals and birdlife_city_data_fixed_no_boreal$realm
Bartlett's K-squared = 13.866, df = 5, p-value = 0.01648
leveneTest(birdlife_city_data_fixed_no_boreal$residuals, birdlife_city_data_fixed_no_boreal$realm)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 5 1.1694 0.3277
130
Variances of groups are significantly different:
bartlett.test(merlin_city_data_fixed_no_boreal$residuals_of_fit, merlin_city_data_fixed_no_boreal$realm)
Bartlett test of homogeneity of variances
data: merlin_city_data_fixed_no_boreal$residuals_of_fit and merlin_city_data_fixed_no_boreal$realm
Bartlett's K-squared = 71.202, df = 5, p-value = 5.76e-14
leveneTest(merlin_city_data_fixed_no_boreal$residuals_of_fit, merlin_city_data_fixed_no_boreal$realm)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 5 10.565 1.576e-08 ***
130
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Variances of groups are significantly different:
bartlett.test(birdlife_city_data_fixed_no_boreal$residuals_of_fit, birdlife_city_data_fixed_no_boreal$realm)
Bartlett test of homogeneity of variances
data: birdlife_city_data_fixed_no_boreal$residuals_of_fit and birdlife_city_data_fixed_no_boreal$realm
Bartlett's K-squared = 47.349, df = 5, p-value = 4.824e-09
leveneTest(birdlife_city_data_fixed_no_boreal$residuals_of_fit, birdlife_city_data_fixed_no_boreal$realm)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 5 11.458 3.558e-09 ***
130
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
unique(merlin_city_data_fixed_no_boreal$realm)
[1] Afrotropic Indomalayan Palearctic Neotropic Nearctic Australasia
Levels: Afrotropic Australasia Indomalayan Nearctic Neotropic Palearctic
for(i in 1:length(realms)) {
realm <- realms[i]
delta <- cv.glm(
data = merlin_city_data_fixed_no_boreal[merlin_city_data_fixed_no_boreal$realm != realm,],
glm(
data = merlin_city_data_fixed_no_boreal[merlin_city_data_fixed_no_boreal$realm != realm,],
formula = response ~ merlin_pool_size + realm
)
)$delta[1]
print(paste("Exclude Realm CV", realm, delta))
}
[1] "Exclude Realm CV Afrotropic 12.7706233346667"
[1] "Exclude Realm CV Indomalayan 12.9137476664078"
[1] "Exclude Realm CV Palearctic 12.4964911080885"
[1] "Exclude Realm CV Neotropic 12.6929671331165"
[1] "Exclude Realm CV Nearctic 14.8108152865969"
[1] "Exclude Realm CV Australasia 12.8293479262406"
for(i in 1:length(realms)) {
realm <- realms[i]
delta <- cv.glm(
data = birdlife_city_data_fixed_no_boreal[birdlife_city_data_fixed_no_boreal$realm != realm,],
glm(
data = birdlife_city_data_fixed_no_boreal[birdlife_city_data_fixed_no_boreal$realm != realm,],
formula = response ~ birdlife_pool_size + realm
)
)$delta[1]
print(paste("Exclude Realm CV", realm, delta))
}
[1] "Exclude Realm CV Afrotropic 5.72613070569415"
[1] "Exclude Realm CV Indomalayan 4.93246008728438"
[1] "Exclude Realm CV Palearctic 4.86504674156399"
[1] "Exclude Realm CV Neotropic 5.2626582383154"
[1] "Exclude Realm CV Nearctic 5.87005564586449"
[1] "Exclude Realm CV Australasia 5.49824641265513"